import os
import pandas as pd
import numpy as np
import feather
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sle.modeling import generate_data, prep_data
from sle.penalization import regularization_range, choose_C, regularization_path, plot_regularization_path, coef_plots_regularized
%load_ext autoreload
%autoreload 2
Predict whether an SLE patient has nefritis or not
= generate_data('imid')
data_all = generate_data('rest') X_test_df
Code for loading original data
= os.path.join('..', 'data', 'processed')
data_dir = feather.read_dataframe(os.path.join(data_dir, 'imid.feather'))
data_all = feather.read_dataframe(os.path.join(data_dir,'rest.feather')) X_test_df
= RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=40)
cv = PowerTransformer(method='box-cox') trf
= data_all[data_all.Class=='SLE'].Nefritis.astype(np.int64)
y_nefritis = data_all[data_all.Class=='SLE'].drop(["Class"]+["Arthritis","Pleurisy","Pericarditis","Nefritis"] + ["dsDNA1"], axis=1) X_nefritis
= X_nefritis + 1 # Some < 0 values > -1. Because negative fluorescence isn't possible, and Box-Cox requires strictly positive values, add ofset
Xp1_nefritis = pd.DataFrame(trf.fit_transform(Xp1_nefritis), index=X_nefritis.index, columns=X_nefritis.columns) X_trf_nefritis
= LogisticRegression(penalty='l1', max_iter = 10000, solver = 'liblinear') clf_lasso
= 100
K = regularization_range(Xp1_nefritis,y_nefritis,trf)
lambda_min, lambda_max = np.logspace(np.log10(1/lambda_min),np.log10(1/lambda_max), K)
Cs_lasso_nefritis = Pipeline([
pipe 'trf', trf),
('clf', clf_lasso)
(
])= [{
params "clf__C": Cs_lasso_nefritis
}]
= GridSearchCV(pipe, params, cv = cv, scoring = 'roc_auc', refit=choose_C) lasso_nefritis
%%time
lasso_nefritis.fit(Xp1_nefritis,y_nefritis)
CPU times: user 4min 53s, sys: 351 ms, total: 4min 53s
Wall time: 5min 49s
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=40),
estimator=Pipeline(steps=[('trf',
PowerTransformer(method='box-cox')),
('clf',
LogisticRegression(max_iter=10000,
penalty='l1',
solver='liblinear'))]),
param_grid=[{'clf__C': array([28.71916258, 26.78358714, 24.97846301, 23.2949982 , 21.72499329,
20.26080145, 18.89529124, 17.62181185,...
0.21724993, 0.20260801, 0.18895291, 0.17621812, 0.16434161,
0.15326553, 0.14293595, 0.13330254, 0.1243184 , 0.11593975,
0.1081258 , 0.10083849, 0.09404231, 0.08770417, 0.08179321,
0.07628062, 0.07113956, 0.06634499, 0.06187356, 0.05770349,
0.05381447, 0.05018755, 0.04680508, 0.04365057, 0.04070867,
0.03796504, 0.03540633, 0.03302006, 0.03079462, 0.02871916])}],
refit=<function choose_C at 0x7fb3520bd560>, scoring='roc_auc')
Best model:
'mean_test_score'].max() lasso_nefritis.cv_results_[
0.6029809240238928
Score with lambda selected through 1 SE rule:
'mean_test_score'][lasso_nefritis.best_index_] lasso_nefritis.cv_results_[
0.5939512492715618
= regularization_path(Cs_lasso_nefritis, clf_lasso, X_trf_nefritis, y_nefritis) coefs_lasso_nefritis, nnz_coefs_lasso_nefritis
= plot_regularization_path(1/Cs_lasso_nefritis, coefs_lasso_nefritis, nnz_coefs_lasso_nefritis, lasso_nefritis.cv_results_)
ax1, ax2, ax22 #ax22.set_ylim([0.8, 1])
"mean_test_score"], varnames=Xp1_nefritis.columns) coef_plots_regularized(coefs_lasso_nefritis, nnz_coefs_lasso_nefritis, lasso_nefritis.cv_results_[
Non-zero coefficients of the model selected with the 1SE rule:
= X_nefritis.columns)[lambda x: x!=0].sort_values(ascending=False)) (pd.Series(lasso_nefritis.best_estimator_.named_steps.clf.coef_.squeeze(), index
C1q 0.177918
GAPDH 0.157594
dsDNA2 0.116051
CMV 0.110894
Mi2 0.041954
ASCA 0.030033
Nucleosome 0.005160
SMP 0.003712
CollagenII -0.001279
Enolasearg -0.009766
TPO -0.046097
Cardiolipin -0.068313
GBM -0.112145
RNAPolIII -0.144926
dtype: float64